suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Project root on the local drive mount. Every path below is built by
# appending to this string with paste0(), so the trailing slash is required.
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"

# Output directories for exported tables and saved figures.
tabledir <- paste0(wd, "Tables/DRS_m3C_sites/")
figdir <- paste0(wd, "Figures/DRS_m3C_sites/Chrs/")

# Run the remainder of the script from the project root.
setwd(wd)
# Read a headerless two-column TSV of methylated positions.
#
# The first column is read as `position` and then split on the literal "|"
# into `transcript_id` and `position`; the second column is read as `kmer`.
#
# Args:
#   path: location of the TSV file, passed straight to read_tsv().
#
# Returns:
#   A tibble with columns `transcript_id`, `position` and `kmer`.
#   separate() is called without `convert = TRUE`, so `position` stays a
#   character column.
read_methylated_position_tsv <- function(path) {
  read_tsv(
    path,
    col_names = c('position', 'kmer'),
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    show_col_types = FALSE
  ) |>
    separate(position, into = c('transcript_id', 'position'), sep = '[|]')
}
#' # Read methylated position information and add annotation
# Load the transcript annotation table. No column types are supplied, so
# readr guesses them and prints the specification reproduced below.
espresso_annotation <-
  paste0(wd, 'Tables/Espresso_AsPC1_annotation_cleaned_2024-03-29.tsv') |>
  read_tsv()
## Rows: 36717 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): seqname, source, feature, score, strand, frame, gene_id, transcrip...
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble for a quick visual check.
espresso_annotation
## # A tibble: 36,717 × 14
## seqname source feature start end score strand frame gene_id transcript_id
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 chr3 annot… transc… 3.15e6 3.15e6 . - . ENSG00… ENST00000498…
## 2 chr3 annot… transc… 3.15e6 3.15e6 . - . ENSG00… ENST00000459…
## 3 chr3 annot… transc… 3.15e6 3.18e6 . - . ENSG00… ENST00000231…
## 4 chr3 annot… transc… 3.15e6 3.18e6 . - . ENSG00… ENST00000432…
## 5 chr3 annot… transc… 3.13e6 3.13e6 . + . ENSG00… ENST00000339…
## 6 chr3 annot… transc… 3.15e6 3.16e6 . - . ENSG00… ENST00000488…
## 7 chr3 annot… transc… 3.13e6 3.13e6 . + . ENSG00… ENST00000420…
## 8 chr3 annot… transc… 3.14e6 3.15e6 . + . ENSG00… ENST00000698…
## 9 chr3 annot… transc… 3.17e6 3.18e6 . - . ENSG00… ENST00000450…
## 10 chr3 annot… transc… 3.15e6 3.15e6 . + . ENSG00… ENST00000698…
## # ℹ 36,707 more rows
## # ℹ 4 more variables: gene_type <chr>, gene_name <chr>, transcript_type <chr>,
## # transcript_name <chr>
# Load the table of candidate methylated positions (gzip-compressed TSV).
# Column types are guessed by readr; the specification is reproduced below.
methylated_positions <-
  paste0(wd, 'Tables/DRS/Positions/intensityup_common_2024-04-10.tsv.gz') |>
  read_tsv()
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble for a quick visual check.
methylated_positions
## # A tibble: 605 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 43 ACACA 1
## 5 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 6 ENST00000389680.2 MT-RNR1-201 71 GTTCA 1
## 7 ENST00000389680.2 MT-RNR1-201 73 TCACC 1
## 8 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 9 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 10 ENST00000389680.2 MT-RNR1-201 138 GCTTA 1
## # ℹ 595 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Count rows per group and express each count as a percentage of the total.
#
# Args:
#   df: a data frame, normally already grouped with group_by(); an ungrouped
#       input is treated as a single group.
#
# Returns:
#   An ungrouped tibble with the grouping columns, `n` (rows per group) and
#   `percentage` (100 * n / sum of all n), sorted by decreasing percentage.
calc_percentage <- function(df) {
  # reframe() returns an ungrouped result, so sum(n) below spans all groups.
  group_counts <- reframe(df, n = n())
  with_share <- mutate(group_counts, percentage = 100 * n / sum(n))
  arrange(with_share, -percentage)
}
# Share of sites whose reference k-mer matches '.{2}C.{2}', i.e. a C with two
# characters on either side. The grouping expression is left unnamed, so the
# result column is labelled with the expression itself.
calc_percentage(
  group_by(methylated_positions, grepl('.{2}C.{2}', ref_kmer))
)
## # A tibble: 2 × 3
## `grepl(".{2}C.{2}", ref_kmer)` n percentage
## <lgl> <int> <dbl>
## 1 TRUE 489 80.8
## 2 FALSE 116 19.2
# Per-chromosome breakdown of the sites that match the '.{2}C.{2}' pattern.
methylated_positions |>
  filter(grepl(pattern = '.{2}C.{2}', x = ref_kmer)) |>
  group_by(seqname) |>
  calc_percentage()
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Per-chromosome breakdown of the sites that do NOT match '.{2}C.{2}'.
methylated_positions |>
  filter(!grepl(pattern = '.{2}C.{2}', x = ref_kmer)) |>
  group_by(seqname) |>
  calc_percentage()
## # A tibble: 17 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 47 40.5
## 2 chr11 17 14.7
## 3 chr1 9 7.76
## 4 chr12 7 6.03
## 5 chr10 6 5.17
## 6 chr16 5 4.31
## 7 chr5 4 3.45
## 8 chr13 3 2.59
## 9 chr14 3 2.59
## 10 chr2 3 2.59
## 11 KN196481.1 2 1.72
## 12 chr17 2 1.72
## 13 chr19 2 1.72
## 14 chr4 2 1.72
## 15 chr7 2 1.72
## 16 GL000251.2 1 0.862
## 17 chrX 1 0.862
# Share of sites whose reference k-mer contains a C at any position. The
# grouping expression is left unnamed, so the result column is labelled with
# the expression itself.
calc_percentage(
  group_by(methylated_positions, grepl('C', ref_kmer))
)
## # A tibble: 2 × 3
## `grepl("C", ref_kmer)` n percentage
## <lgl> <int> <dbl>
## 1 TRUE 592 97.9
## 2 FALSE 13 2.15
# Per-chromosome breakdown of the sites whose k-mer contains a C anywhere.
methylated_positions |>
  filter(grepl(pattern = 'C', x = ref_kmer)) |>
  group_by(seqname) |>
  calc_percentage()
## # A tibble: 25 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 273 46.1
## 2 chr12 52 8.78
## 3 chr16 46 7.77
## 4 chr11 44 7.43
## 5 chr1 43 7.26
## 6 chr2 22 3.72
## 7 chr19 16 2.70
## 8 chr7 16 2.70
## 9 chr5 12 2.03
## 10 chr4 7 1.18
## # ℹ 15 more rows
# Keep only the sites whose reference k-mer matches '.{2}C.{2}' (a C flanked
# by two characters on each side), then print the subset.
methylated_positions_center_C <- filter(
  methylated_positions,
  grepl('.{2}C.{2}', ref_kmer)
)
methylated_positions_center_C
## # A tibble: 489 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 5 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 6 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 7 ENST00000389680.2 MT-RNR1-201 148 GCCAC 1
## 8 ENST00000389680.2 MT-RNR1-201 153 ACCCC 1
## 9 ENST00000389680.2 MT-RNR1-201 154 CCCCC 1
## 10 ENST00000389680.2 MT-RNR1-201 155 CCCCA 1
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Sites whose reference k-mer contains a C at any position.
#
# This has to be a row filter, mirroring methylated_positions_center_C. The
# previous group_by() kept all 605 rows and only attached a grouping column,
# so every summary derived from this object was identical to the all-sites
# summary instead of describing the C-containing subset.
#
# NOTE(review): printed results that were produced from the grouped version
# (the tibble shown directly below, and the per-chromosome and chrM tables
# and donut plot derived from methylated_positions_C) are out of date until
# the script is re-run.
methylated_positions_C <-
  methylated_positions |>
  filter(grepl('C', ref_kmer))
methylated_positions_C
## # A tibble: 605 × 66
## # Groups: grepl("C", ref_kmer) [2]
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 43 ACACA 1
## 5 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 6 ENST00000389680.2 MT-RNR1-201 71 GTTCA 1
## 7 ENST00000389680.2 MT-RNR1-201 73 TCACC 1
## 8 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 9 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 10 ENST00000389680.2 MT-RNR1-201 138 GCTTA 1
## # ℹ 595 more rows
## # ℹ 61 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Per-chromosome counts and percentages over all sites.
methylated_positions_groupedby_chr <- calc_percentage(
  group_by(methylated_positions, seqname)
)
methylated_positions_groupedby_chr
## # A tibble: 25 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 276 45.6
## 2 chr12 54 8.93
## 3 chr16 46 7.60
## 4 chr11 45 7.44
## 5 chr1 43 7.11
## 6 chr2 22 3.64
## 7 chr7 18 2.98
## 8 chr19 16 2.64
## 9 chr5 13 2.15
## 10 chr4 8 1.32
## # ℹ 15 more rows
# Per-chromosome counts and percentages for methylated_positions_center_C.
methylated_positions_center_C_groupedby_chr <- calc_percentage(
  group_by(methylated_positions_center_C, seqname)
)
methylated_positions_center_C_groupedby_chr
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Per-chromosome counts and percentages for methylated_positions_C.
# group_by() is called without `.add`, so it replaces any grouping the input
# already carries.
methylated_positions_C_groupedby_chr <- calc_percentage(
  group_by(methylated_positions_C, seqname)
)
methylated_positions_C_groupedby_chr
## # A tibble: 25 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 276 45.6
## 2 chr12 54 8.93
## 3 chr16 46 7.60
## 4 chr11 45 7.44
## 5 chr1 43 7.11
## 6 chr2 22 3.64
## 7 chr7 18 2.98
## 8 chr19 16 2.64
## 9 chr5 13 2.15
## 10 chr4 8 1.32
## # ℹ 15 more rows
# Count and percentage of rows located on chrM versus everywhere else.
#
# Args:
#   df: a data frame with a `seqname` column. Any grouping already present
#       is replaced, because group_by() is called without `.add`.
#
# Returns:
#   An ungrouped tibble with columns `isChrM` (logical), `n` and
#   `percentage`, in the order produced by calc_percentage().
calc_percentage_chrM <- function(df) {
  df |>
    # Name the grouping column when it is created. Renaming the auto-generated
    # column afterwards depended on how R deparses the expression
    # (`seqname == "chrM"`), which is easy to break with a harmless edit.
    group_by(isChrM = seqname == 'chrM') |>
    calc_percentage() |>
    # Defensive: keep the result ungrouped regardless of what
    # calc_percentage() returns.
    ungroup()
}
# chrM versus non-chrM split over all sites.
methylated_positions_groupedby_chrMornot <-
  calc_percentage_chrM(methylated_positions)
methylated_positions_groupedby_chrMornot
## # A tibble: 2 × 3
## isChrM n percentage
## <lgl> <int> <dbl>
## 1 FALSE 329 54.4
## 2 TRUE 276 45.6
# chrM versus non-chrM split for methylated_positions_center_C.
methylated_positions_center_C_groupedby_chrMornot <-
  calc_percentage_chrM(methylated_positions_center_C)
methylated_positions_center_C_groupedby_chrMornot
## # A tibble: 2 × 3
## isChrM n percentage
## <lgl> <int> <dbl>
## 1 FALSE 260 53.2
## 2 TRUE 229 46.8
# chrM versus non-chrM split for methylated_positions_C.
methylated_positions_C_groupedby_chrMornot <-
  calc_percentage_chrM(methylated_positions_C)
methylated_positions_C_groupedby_chrMornot
## # A tibble: 2 × 3
## isChrM n percentage
## <lgl> <int> <dbl>
## 1 FALSE 329 54.4
## 2 TRUE 276 45.6
# Turn a `percentage` column into stacked [ymin, ymax] intervals on a 0-1
# scale, one interval per row in the current row order.
#
# Args:
#   df: a data frame with a numeric `percentage` column.
#
# Returns:
#   `df` with two added columns: `ymax`, the running total of
#   percentage / 100, and `ymin`, the previous row's `ymax` (0 for the first
#   row).
add_yrange <- function(df) {
  stacked <- mutate(df, ymax = cumsum(percentage / 100))
  upper_bounds <- stacked$ymax
  # Each interval starts where the previous one ended.
  stacked$ymin <- c(0, head(upper_bounds, n = -1))
  stacked
}
# Draw a donut chart of the chrM / non-chrM split.
#
# Args:
#   df: a data frame with a logical `isChrM` column and a `percentage`
#       column, as returned by calc_percentage_chrM().
#
# Returns:
#   A ggplot object: one ring segment per row, drawn as rectangles in polar
#   coordinates, with a repelled text label per segment.
donutplot_chrM <- function(df) {
  # Colours are keyed by class name. With an unnamed vector ggplot2 assigns
  # colours by position, so a table containing only one class (for example
  # only TRUE) would be drawn in the other class's colour.
  chrM_colours <- c('FALSE' = 'blue', 'TRUE' = 'red')

  df |>
    add_yrange() |>
    ggplot(aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = isChrM, colour = isChrM
    )) +
    geom_rect() +
    # Mapping y to the angle turns the stacked rectangles into ring segments.
    coord_polar(theta = 'y') +
    ggrepel::geom_text_repel(
      aes(label = isChrM, y = (ymin + ymax) / 2), x = 1
    ) +
    # The x range extends below xmin = 2, which leaves the hole in the middle.
    xlim(c(-1, 4)) +
    scale_fill_manual(values = chrM_colours) +
    scale_color_manual(values = chrM_colours) +
    theme_void()
}
# Build and save one chrM donut plot per site set.
# ggsave_multiple_formats() is not defined in this file (presumably it comes
# from myUtilities). The plot object is passed as a bare variable name, exactly
# as the pipe did, in case the helper derives the file name from it.

# All sites.
methylated_positions_groupedby_chrMornot_donutplot <-
  donutplot_chrM(methylated_positions_groupedby_chrMornot)
ggsave_multiple_formats(
  methylated_positions_groupedby_chrMornot_donutplot,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

# methylated_positions_C.
methylated_positions_C_groupedby_chrMornot_donutplot <-
  donutplot_chrM(methylated_positions_C_groupedby_chrMornot)
ggsave_multiple_formats(
  methylated_positions_C_groupedby_chrMornot_donutplot,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

# methylated_positions_center_C.
methylated_positions_center_C_groupedby_chrMornot_donutplot <-
  donutplot_chrM(methylated_positions_center_C_groupedby_chrMornot)
ggsave_multiple_formats(
  methylated_positions_center_C_groupedby_chrMornot_donutplot,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)
# Horizontal bar chart of the number of sites per chromosome, with the
# chromosomes ordered by their count.
ggplot(
  methylated_positions_groupedby_chr,
  aes(x = reorder(seqname, n), y = n)
) +
  geom_bar(stat = 'identity') +
  coord_flip()
# Number of distinct (seqname, transcript_id) pairs in the annotation for each
# seqname, largest first.
num_detected_transcripts_in_chromosomes <-
  espresso_annotation |>
  # Naming the columns in distinct() keeps only those two columns.
  distinct(seqname, transcript_id) |>
  group_by(seqname) |>
  reframe(num_detected_transcripts_in_chr = n()) |>
  arrange(-num_detected_transcripts_in_chr)
num_detected_transcripts_in_chromosomes
## # A tibble: 69 × 2
## seqname num_detected_transcripts_in_chr
## <chr> <int>
## 1 chr1 3605
## 2 chr2 2776
## 3 chr11 2422
## 4 chr17 2237
## 5 chr19 2236
## 6 chr7 2146
## 7 chr3 2121
## 8 chr12 2105
## 9 chr16 1906
## 10 chr5 1733
## # ℹ 59 more rows
# Number of sites per transcript, carrying the transcript's annotation
# columns along as grouping keys; transcripts with the most sites come first.
num_sites_in_transcripts <-
  methylated_positions |>
  group_by(
    seqname,
    transcript_id,
    transcript_name,
    gene_name,
    gene_type,
    transcript_type
  ) |>
  reframe(num_sites_in_tr = n()) |>
  arrange(-num_sites_in_tr)

# export_tsv() is not defined in this file (presumably it comes from
# myUtilities). The table is passed as a bare variable name, exactly as the
# pipe did; the exported file name shown below contains that name.
export_tsv(num_sites_in_transcripts, outdir = tabledir)
##
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/DRS_m3C_sites/num_sites_in_transcripts_2024-04-15.tsv
## # A tibble: 85 × 7
## seqname transcript_id transcript_name gene_name gene_type transcript_type
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 chrM ENST00000389680.2 MT-RNR1-201 MT-RNR1 Mt_rRNA Mt_rRNA
## 2 chr16 ENST00000343262.9 RPS2-201 RPS2 protein_… protein_coding
## 3 chrM ENST00000361789.2 MT-CYB-201 MT-CYB protein_… protein_coding
## 4 chrM ENST00000361453.3 MT-ND2-201 MT-ND2 protein_… protein_coding
## 5 chr11 ENST00000273550.… FTH1-201 FTH1 protein_… protein_coding
## 6 chrM ENST00000361381.2 MT-ND4-201 MT-ND4 protein_… protein_coding
## 7 chrM ENST00000361624.2 MT-CO1-201 MT-CO1 protein_… protein_coding
## 8 chr12 ENST00000392514.9 RPLP0-203 RPLP0 protein_… protein_coding
## 9 chrM ENST00000361739.1 MT-CO2-201 MT-CO2 protein_… protein_coding
## 10 chr2 ENST00000233143.6 TMSB10-201 TMSB10 protein_… protein_coding
## # ℹ 75 more rows
## # ℹ 1 more variable: num_sites_in_tr <int>
# Number of transcripts carrying at least one site, per seqname. Each row of
# num_sites_in_transcripts is one transcript, so counting rows per seqname
# counts transcripts.
num_transcripts_with_m3Csites_groupedby_chr <-
  reframe(group_by(num_sites_in_transcripts, seqname), n = n()) |>
  arrange(-n)
num_transcripts_with_m3Csites_groupedby_chr
## # A tibble: 25 × 2
## seqname n
## <chr> <int>
## 1 chr12 11
## 2 chrM 11
## 3 chr1 8
## 4 chr11 8
## 5 chr5 7
## 6 chr10 4
## 7 chr19 3
## 8 chr2 3
## 9 chr7 3
## 10 chr8 3
## # ℹ 15 more rows
# Percentage of detected transcripts on each chromosome that carry at least
# one site.
percent_m3CRNAs_in_chr <-
  num_transcripts_with_m3Csites_groupedby_chr |>
  # Keep only seqnames containing 'chr'; this drops the other contig names.
  filter(grepl('chr', seqname)) |>
  # State the join key explicitly. Without `by`, dplyr joins on every shared
  # column name and prints a message, so adding a same-named column to either
  # table would silently change the key.
  left_join(
    num_detected_transcripts_in_chromosomes,
    by = join_by(seqname)
  ) |>
  mutate(percent_m3CRNAs_in_chr = 100 * n / num_detected_transcripts_in_chr) |>
  arrange(-percent_m3CRNAs_in_chr)
percent_m3CRNAs_in_chr
## # A tibble: 23 × 4
## seqname n num_detected_transcripts_in_chr percent_m3CRNAs_in_chr
## <chr> <int> <int> <dbl>
## 1 chrM 11 23 47.8
## 2 chr12 11 2105 0.523
## 3 chr5 7 1733 0.404
## 4 chr11 8 2422 0.330
## 5 chrX 3 913 0.329
## 6 chr10 4 1384 0.289
## 7 chr1 8 3605 0.222
## 8 chr8 3 1408 0.213
## 9 chr18 1 525 0.190
## 10 chr20 2 1054 0.190
## # ℹ 13 more rows
# Horizontal bar chart of the per-chromosome percentage, with chromosomes
# ordered by that percentage.
percent_m3CRNAs_in_chr_barplot <-
  ggplot(
    percent_m3CRNAs_in_chr,
    aes(
      x = reorder(seqname, percent_m3CRNAs_in_chr),
      y = percent_m3CRNAs_in_chr
    )
  ) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  labs(x = '', y = '% of transcripts\nwith m3C sites')

# ggsave_multiple_formats() is not defined in this file (presumably it comes
# from myUtilities). The plot is passed as a bare variable name, exactly as
# the pipe did.
ggsave_multiple_formats(
  percent_m3CRNAs_in_chr_barplot,
  width = 4, height = 6, fontsize = 7, outdir = figdir
)
# One point per transcript: number of sites (horizontal axis after the flip)
# against the transcript's seqname.
ggplot(
  num_sites_in_transcripts,
  aes(x = reorder(seqname, num_sites_in_tr), y = num_sites_in_tr)
) +
  geom_point() +
  coord_flip()
# Draw a donut chart with one ring segment per `genetype2` category.
#
# Args:
#   df: a data frame with `genetype2` and `percentage` columns.
#
# Returns:
#   A ggplot object: rectangles in polar coordinates, one per row, with a
#   repelled text label per segment.
donutplot_genetype <- function(df) {
  # Four colours, unnamed, so ggplot2 assigns them to categories by position.
  slice_colours <- c('#0099ff', '#ff9900', '#ff0099', '#9900ff')

  ring <- add_yrange(df)

  segment_labels <- ggrepel::geom_text_repel(
    aes(label = genetype2, y = (ymin + ymax) / 2),
    x = 1
  )

  ggplot(
    ring,
    aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = genetype2, colour = genetype2
    )
  ) +
    geom_rect() +
    # Mapping y to the angle turns the stacked rectangles into ring segments.
    coord_polar(theta = 'y') +
    segment_labels +
    # The x range extends below xmin = 2, which leaves the hole in the middle.
    xlim(c(-1, 4)) +
    scale_fill_manual(values = slice_colours) +
    scale_color_manual(values = slice_colours) +
    theme_void()
}
# Classify every site-bearing transcript into a coarse category and compute
# each category's share.
# NOTE(review): a transcript that is neither protein_coding nor on chrM, and
# whose gene_type is not NA, matches no branch and gets genetype2 = NA. Also,
# every non-protein_coding chrM transcript is labelled 'mt-rRNA'. Confirm that
# both are intended.
num_transcripts_groupedby_genetype <-
  num_sites_in_transcripts |>
  mutate(
    # Helper flags; they are dropped again by the summary in
    # calc_percentage(), which keeps only the grouping column and the counts.
    is_coding = gene_type == 'protein_coding',
    is_mito = seqname == 'chrM',
    genetype2 = case_when(
      is_coding & is_mito ~ 'mt-mRNA',
      is_coding & !is_mito ~ 'mRNA',
      !is_coding & is_mito ~ 'mt-rRNA',
      is.na(gene_type) ~ 'unannotated gene'
    )
  ) |>
  group_by(genetype2) |>
  calc_percentage() |>
  # Adds ymin/ymax so they appear in the printed table; donutplot_genetype()
  # recomputes them from `percentage` anyway.
  add_yrange()
num_transcripts_groupedby_genetype
## # A tibble: 4 × 5
## genetype2 n percentage ymax ymin
## <chr> <int> <dbl> <dbl> <dbl>
## 1 mRNA 72 84.7 0.847 0
## 2 mt-mRNA 9 10.6 0.953 0.847
## 3 mt-rRNA 2 2.35 0.976 0.953
## 4 unannotated gene 2 2.35 1 0.976
# Build the gene-type donut chart and save it.
num_transcripts_groupedby_genetype_donut <-
  donutplot_genetype(num_transcripts_groupedby_genetype)

# ggsave_multiple_formats() is not defined in this file (presumably it comes
# from myUtilities). The plot is passed as a bare variable name, exactly as
# the pipe did.
ggsave_multiple_formats(
  num_transcripts_groupedby_genetype_donut,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)